General settings

In [1]:
import pandas as pd
import numpy as np
from itertools import product

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_context('notebook')

from IPython.display import display
In [2]:
drop_pos = ['dropoff_longitude','dropoff_latitude']
pick_pos = ['pickup_longitude', 'pickup_latitude']

lon_pos = ['pickup_longitude', 'dropoff_longitude']
lat_pos = ['pickup_latitude', 'dropoff_latitude']

position = ['pickup_longitude', 'pickup_latitude','dropoff_longitude','dropoff_latitude']


week = ['Sunday', 'Monday', 'Tuesday', 'Wednesday','Thursday', 'Friday', 'Saturday']

Initial exploration

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
display(train.head())
display(train.describe().style)
id vendor_id pickup_datetime dropoff_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag trip_duration
0 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30 1 -73.982155 40.767937 -73.964630 40.765602 N 455
1 id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38 1 -73.980415 40.738564 -73.999481 40.731152 N 663
2 id3858529 2 2016-01-19 11:35:24 2016-01-19 12:10:48 1 -73.979027 40.763939 -74.005333 40.710087 N 2124
3 id3504673 2 2016-04-06 19:32:31 2016-04-06 19:39:40 1 -74.010040 40.719971 -74.012268 40.706718 N 429
4 id2181028 2 2016-03-26 13:30:55 2016-03-26 13:38:10 1 -73.973053 40.793209 -73.972923 40.782520 N 435
vendor_id passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude trip_duration
count 1.45864e+06 1.45864e+06 1.45864e+06 1.45864e+06 1.45864e+06 1.45864e+06 1.45864e+06
mean 1.53495 1.66453 -73.9735 40.7509 -73.9734 40.7518 959.492
std 0.498777 1.31424 0.0709019 0.0328812 0.0706433 0.0358906 5237.43
min 1 0 -121.933 34.3597 -121.933 32.1811 1
25% 1 1 -73.9919 40.7373 -73.9913 40.7359 397
50% 2 1 -73.9817 40.7541 -73.9798 40.7545 662
75% 2 2 -73.9673 40.7684 -73.963 40.7698 1075
max 2 9 -61.3355 51.8811 -61.3355 43.921 3.52628e+06

A partir da descrição do pandas, podemos notar, observando os campos min, max, e os quartis 1 e 3:

  • Existem viagens com 0 passageiros.
  • A duração mais curta é de 1 segundo.
  • A duração mais longa é > 3000 x Q3.

Preprocess datetimes, distance and speed

In [5]:
def add_date_specifics(df, column_name, new_prefix):
    df.loc[:,column_name] = pd.to_datetime(df[column_name])
    
    df[new_prefix +'year'] = df[column_name].dt.year
    df[new_prefix +'month'] = df[column_name].dt.month
    df[new_prefix +'yearday'] = df[column_name].dt.dayofyear
    df[new_prefix +'hour'] = df[column_name].dt.hour
    df[new_prefix +'minute'] = df[column_name].dt.minute
    df[new_prefix +'weekday'] = df[column_name].dt.weekday_name
    df[new_prefix +'weeknum'] = df[column_name].dt.weekday
    df[new_prefix +'weekend'] = train['pickup_weekday'].isin(['Saturday', 'Sunday'])

def add_distances(df):
    """
    Abaixo usamos a aproximação euclideana, ignorando a curvatura da terra,
    por tratarmos de uma unica cidade
    """
    ny_lat = 40.7 # From wikipedia
    R = 6371 # From wikipedia
    deg_rad_ratio = np.pi/180
    conversion = np.array([R*deg_rad_ratio, R*deg_rad_ratio*np.cos(ny_lat*deg_rad_ratio)])
    
    df['line_distance'] = np.sqrt((((df[pick_pos].values - df[drop_pos].values)*conversion)**2).sum(axis=1))
    df['manh_distance'] = np.abs(((df[pick_pos].values - df[drop_pos].values)*conversion)).sum(axis=1)
    #
    #df.apply(lambda x: great_circle(x[pick_pos], x[drop_pos]), axis=1)
    
def add_speed(df):
    """ Velocidade em km/h"""
    df['speed(km/h)'] = 3600*df['line_distance']/df['trip_duration']
    df['manhattan speed(km/h)'] = 3600*df['manh_distance']/df['trip_duration']    
In [6]:
whole = train.append(test)
In [7]:
add_date_specifics(train, 'pickup_datetime', 'pickup_')
add_date_specifics(train, 'dropoff_datetime', 'dropoff_')
add_distances(train)
add_speed(train)
train['log_trip_duration'] = np.log10(train['trip_duration']+1)

add_date_specifics(test, 'pickup_datetime', 'pickup_')
add_distances(test)

Anomalies and cleaning

In [8]:
train['duration_from_date'] = (train['dropoff_datetime']-train['pickup_datetime']).dt.total_seconds().astype(int)
print('Number of duration inconsistencies:', 
      (train.duration_from_date != train.trip_duration).sum(), 
      '(time provided vs time from datetime diff)')
Number of duration inconsistencies: 0 (time provided vs time from datetime diff)
In [9]:
display('Train:',train.isnull().sum(), 'Test: ', test.isnull().sum())
'Train:'
id                       0
vendor_id                0
pickup_datetime          0
dropoff_datetime         0
passenger_count          0
pickup_longitude         0
pickup_latitude          0
dropoff_longitude        0
dropoff_latitude         0
store_and_fwd_flag       0
trip_duration            0
pickup_year              0
pickup_month             0
pickup_yearday           0
pickup_hour              0
pickup_minute            0
pickup_weekday           0
pickup_weeknum           0
pickup_weekend           0
dropoff_year             0
dropoff_month            0
dropoff_yearday          0
dropoff_hour             0
dropoff_minute           0
dropoff_weekday          0
dropoff_weeknum          0
dropoff_weekend          0
line_distance            0
manh_distance            0
speed(km/h)              0
manhattan speed(km/h)    0
log_trip_duration        0
duration_from_date       0
dtype: int64
'Test: '
id                    0
vendor_id             0
pickup_datetime       0
passenger_count       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
pickup_year           0
pickup_month          0
pickup_yearday        0
pickup_hour           0
pickup_minute         0
pickup_weekday        0
pickup_weeknum        0
pickup_weekend        0
line_distance         0
manh_distance         0
dtype: int64

Podemos ver que não há valores faltando nem no conjunto de treino nem no de teste.


Limpeza por distância mínima
In [11]:
plt.figure(figsize=(12,4))

plt.subplot(1,3,1)
sns.distplot(np.log10(train['line_distance']+1), kde=False); plt.xlabel('log (line_distance)');
plt.title('Distancia das viagens em log')
plt.subplot(1,3,2)
plt.title('Distancia das viagens curtas < 1km')
sns.distplot(train['line_distance'][train['line_distance']<1], kde=False);
plt.subplot(1,3,3)
plt.title('Distancia das viagens curtas < .1km')
sns.distplot(train['line_distance'][train['line_distance']<.1], kde=False);

plt.tight_layout()

Notamos que existem muitas viagens que não mudam de lugar, provavelmente problemas na utilização do dispositivo pelos taxistas.

Removeremos as viagens com tamanho < .1km, que é aproximadamente 1 quarteirão. As viagens de tamanho 0 serão removidas juntamente.


In [12]:
train = train[train['line_distance']>.1]

Limpeza por velocidade máxima
In [13]:
plt.figure(figsize=(12,4))

plt.subplot(1,3,1)
sns.distplot(np.log10(train['speed(km/h)']+1)); plt.xlabel('log( speed(km/h) )');

plt.subplot(1,3,2)
ax=sns.distplot(train['speed(km/h)'][train['speed(km/h)'] > 200], kde=False); 

plt.subplot(1,3,3)
ax=sns.distplot(train['speed(km/h)'][train['speed(km/h)'] < 200], kde=False); 

De acordo com esse site e esse, a velocidade maxima em Nova York é 65mph (~104km/h), somente nas rodovias rurais.

Achamos razoável assumir um limite máximo de 200km/h para o nosso dataset, levando em conta que as distâncias medidas são aproximadas. Essa decisão é bastante conservadora, visto que a distância em linha reta tenderá a subestimar as distâncias e as velocidades, e 200km/h é uma velocidade bastante difícil de ser alcançada.


In [14]:
train = train[train['speed(km/h)'] < 200]

Limpeza por passageiros
In [40]:
train['log_line_distance'] = np.log10(train['line_distance'])
train['log( speed(km/h) )'] = np.log10(train['speed(km/h)'])

plt.figure(figsize=(18,6))

ax=plt.subplot(1,4,1)
sns.barplot(x='passenger_count', y=0, hue='vendor_id', data=train.groupby(['vendor_id','passenger_count']).apply(len).reset_index(), ax = ax, palette = 'Pastel2')

ax=plt.subplot(1,4,2)
sns.violinplot(x='passenger_count', y='log_line_distance', hue='vendor_id', data=train, ax = ax, palette = 'Pastel2')

ax=plt.subplot(1,4,3)
sns.violinplot(x='passenger_count', y='log_trip_duration', hue='vendor_id', data=train, ax = ax, palette = 'Pastel2')

ax=plt.subplot(1,4,4)
sns.violinplot(x='passenger_count', y='log( speed(km/h) )', hue='vendor_id', data=train, ax = ax, palette = 'Pastel2')

plt.tight_layout()

Podemos ver que há poucas viagens com 0 passageiros, e que as distribuições são semelhantes aos outros tipos de viagem.

As viagens com passageiros > 6 foram removidas em passos anteriores, da distância mínima e velocidade.


Limpeza por duração máxima
In [59]:
plt.figure(figsize=(12,10))

plt.subplot(3,3,1)
sns.distplot(train['log_trip_duration']); plt.title('Duração das viagens')
plt.subplot(3,3,2)
sns.distplot(train['trip_duration'][train['trip_duration']<60], kde=False); plt.title('Duração das viagens < 1min (zoom)')
plt.subplot(3,3,3)
sns.distplot(train['log_trip_duration'][train['log_trip_duration']>4.4], kde=False); 
plt.title('Duração das viagens > 7h (zoom)'); plt.ylim(0,50);


plt.subplot(3,3,4)
sns.distplot(np.log10(train['speed(km/h)']+1)); plt.xlabel('log( speed(km/h) )');
plt.subplot(3,3,5)
sns.distplot( train['speed(km/h)'][train['trip_duration'] < 20], kde=False ); 
plt.title('Velocidade nas viagens curtas')
plt.subplot(3,3,6)
ax=sns.distplot( train['speed(km/h)'][train['log_trip_duration']>4.4], kde=False )
plt.title('Velocidade nas viagens longas')

#plt.xlabel('log( speed(km/h) )');
plt.subplot(3,3,7)
sns.distplot( train['log_line_distance'], kde=True )
plt.subplot(3,3,8)
sns.distplot( train['line_distance'][train['trip_duration'] < 20], kde=False )
plt.title('Distancia das viagens curtas')
plt.subplot(3,3,9)
sns.distplot( train['line_distance'][train['log_trip_duration']> 4.4], kde=False )
plt.title('Distancia das viagens longas')
plt.tight_layout()

Não parece razoável haverem tantas viagens com duração próxima a 1 dia, (log_trip_duration = 5), mas a duração > 10 dias é completamente não razoável, e provavelmente houve problema no dispositivo, que ficou ligado sem parar.

In [63]:
train = train[train.log_trip_duration<6]
Limpeza por localização
In [64]:
plt.figure(figsize=(12,4))

plt.subplot(1,3,1)
sns.boxplot(x='variable', y='value', data=train[lon_pos].melt())
plt.subplot(1,3,2)
sns.boxplot(x='variable', y='value', data=train[lon_pos].melt()); plt.ylim(-85, -60)
ax=plt.subplot(2,3,3)
train[lon_pos][train[lon_pos]>-90].quantile(np.arange(100)/100).plot(ax=ax)
ax=plt.subplot(2,3,6)
train[lon_pos][train[lon_pos]>-90].quantile(np.arange(20)/100).plot(ax=ax,legend=False);
ax=ax.twinx()
train[lon_pos][train[lon_pos]>-90].quantile(np.arange(10,100)/100).plot(ax=ax,legend=False);

plt.tight_layout()
In [21]:
plt.figure(figsize=(32,16))

plt.subplot(1,2,1)
plt.scatter(train['pickup_longitude'], train['pickup_latitude'], s=.3);
plt.xlim(-74.05, -73.77); plt.ylim(40.55,40.93); plt.title('Pickup')

plt.subplot(1,2,2)
plt.scatter(train['dropoff_longitude'], train['dropoff_latitude'], s=.3);
plt.xlim(-74.05, -73.77); plt.ylim(40.55,40.93); plt.title('Dropoff')
Out[21]:
Text(0.5,1,'Dropoff')

Manualmente e por iterações, foram decididos limites para as coordenadas de latitude e longitude que mantivessem as caracteristicas visualmente importantes no mapa, e que serão usadas nos mapas a partir daqui.

No mapa dos pickups é fácil notar dois 'bumps' fora da cidade de Manhattan. Uma pesquisa rápida no google maps mostra que se tratam do aeroporto regional de La Guardia, mais próximo à cidade, e do aeroporto internacional Jonh F. Kennedy, mais distante.

In [23]:
min_lon, max_lon = -74.05, -73.77
min_lat, max_lat = 40.55, 40.93

min_pos = pd.Series(dict(zip(position,[min_lon,min_lat, min_lon, min_lat])))
max_pos = pd.Series(dict(zip(position,[max_lon,max_lat, max_lon, max_lat])))
is_central = ((train[position] > min_pos) & (train[position] < max_pos)).sum(axis=1)==4
In [24]:
plt.figure(figsize=(10,10))

ax = plt.subplot(2,2,1)
sns.distplot(train[is_central]['pickup_longitude'])
ax = plt.subplot(2,2,2)
sns.distplot(train[is_central]['dropoff_longitude'])
ax = plt.subplot(2,2,3)
sns.distplot(train[is_central]['pickup_latitude'])
ax = plt.subplot(2,2,4)
sns.distplot(train[is_central]['dropoff_latitude'])

plt.tight_layout()

Visual quantile analysis

In [41]:
quants = pd.DataFrame()
for p in np.arange(0,1.001,.001):
    quants = quants.append(train[position].quantile(p))
In [42]:
quants[lon_pos].reset_index().set_index(['pickup_longitude'])['index'].plot(label='pickup longitude')
quants[lon_pos].reset_index().set_index(['dropoff_longitude'])['index'].plot(linestyle='--',label='dropoff longitude')
plt.legend(loc='center right')

plt.twiny()
quants[lat_pos].reset_index().set_index(['pickup_latitude'])['index'].plot(color='g',label='pickup latitude')
quants[lat_pos].reset_index().set_index(['dropoff_latitude'])['index'].plot(linestyle='--',color='k', label='dropoff latitude')

plt.ylabel('Cumulative probability')
plt.legend(loc='upper left')
plt.title('')
Out[42]:
Text(0.5,1,'')

É possível ver que ainda possuimos outliers muito distantes tanto para latitude quanto longitude.

Trip duration conditioned

In [43]:
plt.figure(figsize=(18,8))


ax=plt.subplot(2,3,1)
sns.barplot(x='vendor_id', y=0, data= train.groupby('vendor_id').apply(len).reset_index(), palette = 'Pastel2')
plt.ylabel('occurence')

ax=plt.subplot(2,3,2)
sns.violinplot(x='passenger_count', y='log_trip_duration', hue='vendor_id', data=train, ax = ax, palette = 'Pastel2')

ax=plt.subplot(2,3,3)
sns.barplot(x='passenger_count', y=0, hue='vendor_id', data=train.groupby(['vendor_id','passenger_count']).apply(len).reset_index(), ax = ax, palette = 'Pastel2')

ax=plt.subplot2grid((2,3),(1,0))
sns.barplot(x='pickup_hour', y=0, hue='vendor_id', palette = 'Pastel2', 
             data = train.groupby(['vendor_id', 'pickup_hour']).apply(len).reset_index())
plt.title('Number of drives per hour of day')

ax=plt.subplot2grid((2,3),(1,1))
sns.barplot(x='pickup_hour', y='trip_duration', data=train, ax=ax, hue='vendor_id', palette = 'Pastel2')
plt.title('Mean duration of trips')

ax=plt.subplot2grid((2,3),(1,2))
sns.barplot(x='pickup_hour', y='speed(km/h)', data=train, ax=ax, hue='vendor_id', palette = 'Pastel2')
plt.title('Mean speed of trips')

plt.tight_layout()
  • A feature vendor_id provavelmente se trata da empresa de taxi.
  • A maior parte das viagens com mais de 4 passageiros é da empresa 2.
  • A maior parte das viagens de taxi possui apenas 1 passageiro.
In [44]:
plt.figure(figsize=(18,10))

# Weekday-------------
ax=plt.subplot2grid((3,3),(0,0))
train.pickup_weekday.value_counts()[week].plot.bar(ax=ax, rot=15)
plt.title('Number of drives per day of week'); plt.ylim(150000); plt.ylabel('Number of drives')

ax=plt.subplot2grid((3,3),(1,0))
sns.barplot(x='pickup_weekday', y='trip_duration', data=train, order=week, ax=ax)
plt.title('Mean duration of trips'); plt.ylim(400);

ax=plt.subplot2grid((3,3),(2,0))
sns.barplot(x='pickup_weekday', y='speed(km/h)', data=train, order=week, ax=ax)
plt.title('Mean speed of trips'); 

# Hour -------------
ax=plt.subplot2grid((3,3),(0,1))
sns.barplot(x='pickup_hour', y=0, hue='pickup_weekend', palette = 'Set3', 
             data = train.groupby(['pickup_weekend', 'pickup_hour']).apply(len).reset_index())
plt.title('Number of drives per hour of day'); plt.ylabel('Number of drives')

ax=plt.subplot2grid((3,3),(1,1))
sns.barplot(x='pickup_hour', y='trip_duration', data=train, ax=ax, hue='pickup_weekend', palette = 'Set3')
plt.title('Mean duration of trips'); plt.ylim(400);

ax=plt.subplot2grid((3,3),(2,1))
sns.barplot(x='pickup_hour', y='speed(km/h)', data=train, ax=ax, hue='pickup_weekend', palette = 'Set3')
plt.title('Mean speed of trips');

# Month-------------
ax=plt.subplot2grid((3,3),(0,2))
sns.barplot(x='pickup_month', y=0,
                data=train.groupby('pickup_month').apply(len).reset_index(), palette='Set2')
plt.title('Number of drives per month of year'); plt.ylim(150000); plt.ylabel('Number of drives')

ax = plt.subplot2grid((3,3),(1,2))
sns.barplot(x='pickup_month', y='trip_duration', data=train, ax=ax, palette='Set2')
plt.title('Mean duration of trips'); plt.ylim(400);

ax=plt.subplot2grid((3,3),(2,2))
sns.barplot(x='pickup_month', y='speed(km/h)', data=train, ax=ax, palette='Set2')
plt.title('Mean speed of trips');


plt.tight_layout()

É possivel notar uma diferença clara entre os dias da semana e os fins de semana, entre as 5 da manha e as 18. As viagens costumam ser muito mais lentas nos dias da semana, o que provavelmente se deve ao commute, da ida e volta diária ao trabalho.

Sem atenção às horas, ainda é possivel ver uma diferença de velocidade nas viagens, mais lentas no meio da semana, e que curiosamente se mantém altas nas segundas feiras, mesmo sendo um dia de trabalho.

City mapping

In [45]:
def citymap(df, pos = pick_pos, func=len, precision=4000):
    lats = np.arange( int(precision*train[is_central]['dropoff_latitude'].min()),
                      int(precision*train[is_central]['dropoff_latitude'].max())+1)
    lons = np.arange( int(precision*train[is_central]['dropoff_longitude'].min()),
                      int(precision*train[is_central]['dropoff_longitude'].max())+1)
    
    densemap = np.zeros((lons.shape[0], lats.shape[0]))
    local = df.copy()
    local[pos] = local[pos].apply(np.around, decimals=4)
    sparse_vals = local.groupby(pos).apply(func)
    for (lon, lat), count in sparse_vals.iteritems():
        i = int(precision*lon-lons[0])
        j = int(precision*lat-lats[0])
        if i < densemap.shape[0] and j < densemap.shape[1] and i>=0 and j>=0:
            densemap[i,j] = count
    return pd.DataFrame(densemap, index = pd.Index(lons/precision, name='Longitude'), columns=pd.Index(lats/precision, name='Latitude'))
In [46]:
pickmap = np.log10(citymap(train[is_central]) +1)
dropmap = np.log10(citymap(train[is_central], pos = drop_pos) +1)
In [47]:
plt.figure(figsize=(20,30))
ax = plt.subplot(2,1,1)
sns.heatmap(pickmap.T[::-1]); plt.title('Pick-ups')
ax = plt.subplot(2,1,2)
sns.heatmap(dropmap.T[::-1]); plt.title('Drop-offs');
In [48]:
del(pickmap)
del(dropmap)

Observando os mapas de pickup e dropoff, e possivel notar que existe muito mais consistencia nos pickups, o que e bastante razoavel, levando em conta que os taxistas devem pegar passageiros em vias movimentadas e em seus respectivos pontos de taxi, e provavelmente nao passam muito tempo procurando passageiros em vias pequenas. Essa analise considera que o meio mais comum de pegar taxi, para essas duas empresas em questao, ainda e "presencial", em oposiçao a atraves de aplicativos.

Outra possibilidade e que os passageiros tenham menor tendencia a pegar taxis na ida para a cidade, talvez pela maior facilidade do transporte publico nessa direçao, e peguem mais comumente na volta para casa, ou em viagens relacionadas a trabalho dentro da propria cidade. Essa hipotese bate com o fato de que o numero de taxis e muito maior nas horas entre 7h e 23h, em uma diferença que ocorre principalmente nos dias da semana, enquanto no fim de semana a quantidade de taxis e menos variavel.

O mais provavel e que ambos os fatores exerçam alguma influencia sobre os dados.

In [49]:
plt.figure(figsize=(80,12))

for i in range(8):
    plt.subplot(1,8,i+1)
    to_plot = (train['dropoff_hour']==i+5) & is_central
    plt.scatter(train[to_plot]['dropoff_longitude'], train[to_plot]['dropoff_latitude'], 
                    s=5, alpha=.4, c=np.log10(train[to_plot]['speed(km/h)']), cmap='plasma', vmin=.7, vmax=1.7)
    plt.title('Dropoff at %dam'%(i+5))
    plt.axis('off')
  • É possível ver o aumento do trânsito em Manhattan nas primeiras horas do dia, junto com o aumento da densidade de viagens.
  • As viagens para o aeroporto JFK continuam sendo bastante rápidas em geral.
  • Não há mudança aparente de densidade das viagens para fora de Manhattan.
In [50]:
plt.figure(figsize=(80,12))

for i in range(8):
    plt.subplot(1,8,i+1)
    to_plot = (train['dropoff_hour']==i+16) & is_central
    plt.scatter(train[to_plot]['dropoff_longitude'], train[to_plot]['dropoff_latitude'], 
                    s=5, alpha=.4, c=np.log10(train[to_plot]['speed(km/h)']), cmap='plasma', vmin=.7, vmax=1.7)
    plt.title('Dropoff at %dpm'%(i+4))
    plt.axis('off')
  • É possível perceber que, enquanto a densidade de viagens terminando no centro de Manhattan diminue com a chegada da noite, ela aumenta perceptivelmente nos arredores, indicando provavelmente a volta de trabalhadores para suas casas.

  • A velocidade média das viagens, indicada pela cor dos pontos, indica uma melhora gradual no transito, mais rapida na periferia, especialmente mais distante da cidade entre os aeroportos. Às 23h ainda sao lentas as viagens para a parte mais central da cidade.

Feature engineering

In [28]:
raw_feat_list = ['vendor_id','pickup_hour','pickup_month','pickup_weeknum', 'store_and_fwd_flag',
                 'pickup_weekend', 'line_distance', 'manh_distance'] + position
onehot_feat_list = ['pickup_hour', 'pickup_month', 'pickup_weekday']
In [23]:
from sklearn.preprocessing import OneHotEncoder
onehot = OneHotEncoder(sparse=False)

def one_hot_df(df, feature):    
    onehot = OneHotEncoder(sparse=False)
    dense_X = onehot.fit_transform(df[feature].values.reshape(-1,1))
    return pd.DataFrame( dense_X, columns = ['%s_%d'%(feature, i) for i in onehot.active_features_] )
In [100]:
X = pd.get_dummies(train[onehot_feat_list], columns = onehot_feat_list).join(train[raw_feat_list])
X['store_and_fwd_flag'] = X['store_and_fwd_flag'] == 'Y' 

y = train.log_trip_duration

Xtest = pd.get_dummies(test[onehot_feat_list], columns = onehot_feat_list).join(test[raw_feat_list])
Xtest['store_and_fwd_flag'] = Xtest['store_and_fwd_flag'] == 'Y' 

Classification

In [74]:
from xgboost import XGBRegressor, plot_importance
from sklearn.model_selection import cross_val_score

Error metric

In [66]:
def rmsle(y_true,y_pred):
    """
    From https://www.kaggle.com/wiki/RootMeanSquaredLogarithmicError
    """
    assert len(y_true) == len(y_pred)
    return np.square(np.log(y_pred + 1) - np.log(y_true + 1)).mean() ** 0.5

def rmse(y_true,y_pred):
    assert len(y_true) == len(y_pred)
    return np.square(y_pred - y_true).mean() ** 0.5

from sklearn.metrics import make_scorer

rmsle_score = make_scorer(rmsle)
rmse_score = make_scorer(rmse)

Hyperparameter tuning

In [128]:
from skopt import gp_minimize

def optimize(X, y, n_calls=80, n_random_starts=10):
    hyperparameter_space = [(1,15),
                            (0.01,1.),
                            (10,300),
                            (1e-6,10.,"log-uniform"),
                            (1,30),
                            (1e-6,1.,"log-uniform"),
                            (1e-6,1.,"log-uniform"),
                            (1e-2,1.,"log-uniform"),
                            (1e-2,1.,"log-uniform")]
    hyperparameter_names = ['max_depth', 'learning_rate', 'n_estimators', 
                            'gamma', 'min_child_weight', 'reg_alpha', 'reg_lambda', 
                            'colsample_bytree', 'subsample']
    def objective_(params):
        parameters = dict(zip(hyperparameter_names, params))
        print(parameters)
        score = np.mean(np.log(10)*cross_val_score(XGBRegressor(**parameters), X, y, cv=3, scoring=rmse_score))
        return score

    return gp_minimize(objective_, hyperparameter_space, n_calls=n_calls,n_random_starts=n_random_starts, n_jobs=3,verbose=3)
In [129]:
%%time
res = optimize(X, y)
Iteration No: 1 started. Evaluating function at random point.
{'max_depth': 11, 'learning_rate': 0.4739324850107493, 'n_estimators': 290, 'gamma': 4.2811895520041903, 'min_child_weight': 18, 'reg_alpha': 3.3468483228294448e-06, 'reg_lambda': 0.13024497746209746, 'colsample_bytree': 0.036292048691664865, 'subsample': 0.35198233131940876}
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 168.5138
Function value obtained: 0.4446
Current minimum: 0.4446
Iteration No: 2 started. Evaluating function at random point.
{'max_depth': 14, 'learning_rate': 0.11826222492800004, 'n_estimators': 244, 'gamma': 0.0069130302053384326, 'min_child_weight': 7, 'reg_alpha': 0.047908749381150535, 'reg_lambda': 8.7162234621082748e-06, 'colsample_bytree': 0.010742862126743264, 'subsample': 0.055651440607191259}
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 141.4018
Function value obtained: 0.4910
Current minimum: 0.4446
Iteration No: 3 started. Evaluating function at random point.
{'max_depth': 1, 'learning_rate': 0.89516975141011779, 'n_estimators': 220, 'gamma': 0.057223303594064107, 'min_child_weight': 12, 'reg_alpha': 0.0010156624121179828, 'reg_lambda': 1.9746538779435532e-05, 'colsample_bytree': 0.019961727867445878, 'subsample': 0.86239138944307092}
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 81.5053
Function value obtained: 0.4737
Current minimum: 0.4446
Iteration No: 4 started. Evaluating function at random point.
{'max_depth': 6, 'learning_rate': 0.83844978987033647, 'n_estimators': 244, 'gamma': 1.5869561367802096, 'min_child_weight': 16, 'reg_alpha': 0.052833936776964893, 'reg_lambda': 1.7477413040540027e-06, 'colsample_bytree': 0.015859908800482818, 'subsample': 0.019460139290036253}
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 103.1918
Function value obtained: 0.4529
Current minimum: 0.4446
Iteration No: 5 started. Evaluating function at random point.
{'max_depth': 10, 'learning_rate': 0.94143852666688044, 'n_estimators': 103, 'gamma': 0.020096062398424919, 'min_child_weight': 26, 'reg_alpha': 4.6931047572738753e-05, 'reg_lambda': 0.028156811964662805, 'colsample_bytree': 0.1690430267818202, 'subsample': 0.12319972618069623}
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 275.7987
Function value obtained: 0.4839
Current minimum: 0.4446
Iteration No: 6 started. Evaluating function at random point.
{'max_depth': 14, 'learning_rate': 0.70574121463244033, 'n_estimators': 108, 'gamma': 0.00063702185175630436, 'min_child_weight': 5, 'reg_alpha': 0.98794388283997525, 'reg_lambda': 1.7056205558586948e-05, 'colsample_bytree': 0.81887247475650937, 'subsample': 0.54995945360180054}
Iteration No: 6 ended. Evaluation done at random point.
Time taken: 2122.8010
Function value obtained: 0.4280
Current minimum: 0.4280
Iteration No: 7 started. Evaluating function at random point.
{'max_depth': 7, 'learning_rate': 0.70027084094542835, 'n_estimators': 236, 'gamma': 0.056388926256922045, 'min_child_weight': 3, 'reg_alpha': 0.020503953965051811, 'reg_lambda': 0.009946814873628064, 'colsample_bytree': 0.17625170574439769, 'subsample': 0.089809359903611483}
Iteration No: 7 ended. Evaluation done at random point.
Time taken: 431.1337
Function value obtained: 0.7101
Current minimum: 0.4280
Iteration No: 8 started. Evaluating function at random point.
{'max_depth': 12, 'learning_rate': 0.30788211706725255, 'n_estimators': 294, 'gamma': 0.0024461114451665451, 'min_child_weight': 9, 'reg_alpha': 0.010728666231532354, 'reg_lambda': 9.3435150468498503e-06, 'colsample_bytree': 0.031539751237482039, 'subsample': 0.015010603427409458}
Iteration No: 8 ended. Evaluation done at random point.
Time taken: 153.3530
Function value obtained: 0.4715
Current minimum: 0.4280
Iteration No: 9 started. Evaluating function at random point.
{'max_depth': 11, 'learning_rate': 0.51195712833530571, 'n_estimators': 154, 'gamma': 0.068125053822452972, 'min_child_weight': 8, 'reg_alpha': 0.0055008840821208892, 'reg_lambda': 1.9340256271518975e-06, 'colsample_bytree': 0.13989346098573954, 'subsample': 0.14761590044773132}
Iteration No: 9 ended. Evaluation done at random point.
Time taken: 357.3126
Function value obtained: 0.4446
Current minimum: 0.4280
Iteration No: 10 started. Evaluating function at random point.
{'max_depth': 9, 'learning_rate': 0.82596946608322686, 'n_estimators': 104, 'gamma': 0.00011225349446974439, 'min_child_weight': 6, 'reg_alpha': 0.038713606446097851, 'reg_lambda': 6.0053602789226048e-05, 'colsample_bytree': 0.027244881644278039, 'subsample': 0.028380845034260981}
Iteration No: 10 ended. Evaluation done at random point.
Time taken: 72.5921
Function value obtained: 0.4887
Current minimum: 0.4280
Iteration No: 11 started. Searching for the next optimal point.
{'max_depth': 3, 'learning_rate': 0.64963196601929551, 'n_estimators': 20, 'gamma': 7.5226261105822164e-06, 'min_child_weight': 30, 'reg_alpha': 0.00069382159304043497, 'reg_lambda': 1.0, 'colsample_bytree': 1.0, 'subsample': 0.97485666855402753}
Iteration No: 11 ended. Search finished for the next optimal point.
Time taken: 99.2720
Function value obtained: 0.4216
Current minimum: 0.4216
Iteration No: 12 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.49729446345443035, 'n_estimators': 248, 'gamma': 2.6259628179518608, 'min_child_weight': 27, 'reg_alpha': 9.9999999999999995e-07, 'reg_lambda': 0.33299146208641556, 'colsample_bytree': 0.03546407901815328, 'subsample': 0.57611555431284966}
Iteration No: 12 ended. Search finished for the next optimal point.
Time taken: 152.5528
Function value obtained: 0.4440
Current minimum: 0.4216
Iteration No: 13 started. Searching for the next optimal point.
{'max_depth': 1, 'learning_rate': 0.7769189606098803, 'n_estimators': 10, 'gamma': 9.9999999999999995e-07, 'min_child_weight': 20, 'reg_alpha': 1.0, 'reg_lambda': 1.0, 'colsample_bytree': 1.0, 'subsample': 1.0}
Iteration No: 13 ended. Search finished for the next optimal point.
Time taken: 30.8313
Function value obtained: 0.4791
Current minimum: 0.4216
Iteration No: 14 started. Searching for the next optimal point.
{'max_depth': 14, 'learning_rate': 1.0, 'n_estimators': 10, 'gamma': 7.8895047698848787e-05, 'min_child_weight': 3, 'reg_alpha': 1.0, 'reg_lambda': 3.0827306560673407e-05, 'colsample_bytree': 1.0, 'subsample': 1.0}
Iteration No: 14 ended. Search finished for the next optimal point.
Time taken: 223.5436
Function value obtained: 0.4023
Current minimum: 0.4023
Iteration No: 15 started. Searching for the next optimal point.
{'max_depth': 13, 'learning_rate': 0.98998679638653464, 'n_estimators': 218, 'gamma': 0.0036746613501220013, 'min_child_weight': 14, 'reg_alpha': 0.0013597724810523984, 'reg_lambda': 0.42262776928157203, 'colsample_bytree': 0.14503451096819589, 'subsample': 0.99514968300071538}
Iteration No: 15 ended. Search finished for the next optimal point.
Time taken: 745.7694
Function value obtained: 0.4586
Current minimum: 0.4023
Iteration No: 16 started. Searching for the next optimal point.
{'max_depth': 3, 'learning_rate': 0.68382946879961215, 'n_estimators': 104, 'gamma': 2.3983337362827677e-05, 'min_child_weight': 22, 'reg_alpha': 0.0012685684015175227, 'reg_lambda': 0.0146513914172186, 'colsample_bytree': 0.801183295904766, 'subsample': 0.71606448190074201}
Iteration No: 16 ended. Search finished for the next optimal point.
Time taken: 383.0917
Function value obtained: 0.4011
Current minimum: 0.4011
Iteration No: 17 started. Searching for the next optimal point.
{'max_depth': 3, 'learning_rate': 1.0, 'n_estimators': 135, 'gamma': 0.057353131100693372, 'min_child_weight': 1, 'reg_alpha': 0.03581473391248971, 'reg_lambda': 2.0137239124735577e-05, 'colsample_bytree': 0.97904095424646198, 'subsample': 0.076448217400905738}
Iteration No: 17 ended. Search finished for the next optimal point.
Time taken: 388.6795
Function value obtained: 1406.0548
Current minimum: 0.4011
Iteration No: 18 started. Searching for the next optimal point.
{'max_depth': 15, 'learning_rate': 0.49761469342966463, 'n_estimators': 287, 'gamma': 0.35557637937903325, 'min_child_weight': 21, 'reg_alpha': 0.00014365132186246895, 'reg_lambda': 4.4145768973648607e-06, 'colsample_bytree': 0.011551717448960042, 'subsample': 0.02660331530329673}
Iteration No: 18 ended. Search finished for the next optimal point.
Time taken: 157.3269
Function value obtained: 0.4587
Current minimum: 0.4011
Iteration No: 19 started. Searching for the next optimal point.
{'max_depth': 2, 'learning_rate': 0.021840130390973031, 'n_estimators': 144, 'gamma': 0.19516315397306472, 'min_child_weight': 12, 'reg_alpha': 0.40011060761003631, 'reg_lambda': 0.067885502404654172, 'colsample_bytree': 0.028530827630855897, 'subsample': 0.19762875297591262}
Iteration No: 19 ended. Search finished for the next optimal point.
Time taken: 67.0918
Function value obtained: 0.7352
Current minimum: 0.4011
Iteration No: 20 started. Searching for the next optimal point.
{'max_depth': 11, 'learning_rate': 0.21476942062372031, 'n_estimators': 287, 'gamma': 2.3074723022921413e-06, 'min_child_weight': 16, 'reg_alpha': 0.5815787682811846, 'reg_lambda': 0.60169547700624582, 'colsample_bytree': 0.78028984074973284, 'subsample': 0.36043975312396048}
Iteration No: 20 ended. Search finished for the next optimal point.
Time taken: 3508.8745
Function value obtained: 0.3737
Current minimum: 0.3737
Iteration No: 21 started. Searching for the next optimal point.
{'max_depth': 9, 'learning_rate': 0.9362069894965328, 'n_estimators': 183, 'gamma': 0.0045349173437393957, 'min_child_weight': 22, 'reg_alpha': 0.15611774079275118, 'reg_lambda': 0.033687016019141862, 'colsample_bytree': 0.19419786523116284, 'subsample': 0.13865845608444269}
Iteration No: 21 ended. Search finished for the next optimal point.
Time taken: 484.7134
Function value obtained: 0.4775
Current minimum: 0.3737
Iteration No: 22 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.36978306631574698, 'n_estimators': 281, 'gamma': 0.0055727571099460724, 'min_child_weight': 9, 'reg_alpha': 0.0076505053306795889, 'reg_lambda': 1.0582672382584207e-05, 'colsample_bytree': 0.016707082368391305, 'subsample': 0.018246912186417056}
Iteration No: 22 ended. Search finished for the next optimal point.
Time taken: 149.6607
Function value obtained: 0.4758
Current minimum: 0.3737
Iteration No: 23 started. Searching for the next optimal point.
{'max_depth': 10, 'learning_rate': 0.93999238121951156, 'n_estimators': 48, 'gamma': 0.020319396939933791, 'min_child_weight': 26, 'reg_alpha': 0.065759879548594274, 'reg_lambda': 1.5140352968970362e-06, 'colsample_bytree': 0.031231135951095484, 'subsample': 0.12129658012689201}
Iteration No: 23 ended. Search finished for the next optimal point.
Time taken: 37.9676
Function value obtained: 0.4968
Current minimum: 0.3737
Iteration No: 24 started. Searching for the next optimal point.
{'max_depth': 11, 'learning_rate': 0.51016714111432737, 'n_estimators': 124, 'gamma': 0.070566607364587988, 'min_child_weight': 8, 'reg_alpha': 0.42653489592006205, 'reg_lambda': 0.0092771515475972132, 'colsample_bytree': 0.14650329157090727, 'subsample': 0.14705497173314333}
Iteration No: 24 ended. Search finished for the next optimal point.
Time taken: 330.5813
Function value obtained: 0.4283
Current minimum: 0.3737
Iteration No: 25 started. Searching for the next optimal point.
{'max_depth': 13, 'learning_rate': 0.9610276906468781, 'n_estimators': 278, 'gamma': 0.010164352204053963, 'min_child_weight': 14, 'reg_alpha': 1.53157940284104e-05, 'reg_lambda': 4.8680894106580891e-06, 'colsample_bytree': 0.010634494239069908, 'subsample': 0.81024602555950098}
Iteration No: 25 ended. Search finished for the next optimal point.
Time taken: 171.9574
Function value obtained: 0.4535
Current minimum: 0.3737
Iteration No: 26 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.36770028398945204, 'n_estimators': 235, 'gamma': 0.0056347620684666306, 'min_child_weight': 9, 'reg_alpha': 0.6418895859598388, 'reg_lambda': 0.00021325591866996969, 'colsample_bytree': 0.017558058031293424, 'subsample': 0.018425588707767643}
Iteration No: 26 ended. Search finished for the next optimal point.
Time taken: 126.9844
Function value obtained: 0.4610
Current minimum: 0.3737
Iteration No: 27 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.50257916618155674, 'n_estimators': 114, 'gamma': 2.683090797106523, 'min_child_weight': 27, 'reg_alpha': 3.7654249374393125e-06, 'reg_lambda': 0.0037058910835159661, 'colsample_bytree': 0.34506213118083046, 'subsample': 0.57685424698331722}
Iteration No: 27 ended. Search finished for the next optimal point.
Time taken: 822.9417
Function value obtained: 0.3904
Current minimum: 0.3737
Iteration No: 28 started. Searching for the next optimal point.
{'max_depth': 6, 'learning_rate': 0.84084259603941591, 'n_estimators': 199, 'gamma': 1.4696104943350279, 'min_child_weight': 16, 'reg_alpha': 0.10690781038722465, 'reg_lambda': 8.2034154610154399e-06, 'colsample_bytree': 0.3906453104790768, 'subsample': 0.019370761302105879}
Iteration No: 28 ended. Search finished for the next optimal point.
Time taken: 487.7290
Function value obtained: 0.4903
Current minimum: 0.3737
Iteration No: 29 started. Searching for the next optimal point.
{'max_depth': 3, 'learning_rate': 0.68357190791851585, 'n_estimators': 284, 'gamma': 2.517531364564411e-05, 'min_child_weight': 22, 'reg_alpha': 0.0018825146074910289, 'reg_lambda': 1.3229432618327712e-06, 'colsample_bytree': 0.52906511605077866, 'subsample': 0.69868701223082708}
Iteration No: 29 ended. Search finished for the next optimal point.
Time taken: 705.7540
Function value obtained: 0.3981
Current minimum: 0.3737
Iteration No: 30 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.37112111657508395, 'n_estimators': 43, 'gamma': 0.0053502902660913856, 'min_child_weight': 9, 'reg_alpha': 0.00077267709658989541, 'reg_lambda': 0.00011074927268181327, 'colsample_bytree': 0.24766232428159443, 'subsample': 0.0186389970581223}
Iteration No: 30 ended. Search finished for the next optimal point.
Time taken: 180.2526
Function value obtained: 0.4776
Current minimum: 0.3737
Iteration No: 31 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.34883131241455073, 'n_estimators': 200, 'gamma': 0.0018246144494534228, 'min_child_weight': 9, 'reg_alpha': 0.0036260185454120974, 'reg_lambda': 0.0041817864345479261, 'colsample_bytree': 0.026320962289236999, 'subsample': 0.18638703457040695}
Iteration No: 31 ended. Search finished for the next optimal point.
Time taken: 120.4887
Function value obtained: 0.4567
Current minimum: 0.3737
Iteration No: 32 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.58367474175018064, 'n_estimators': 43, 'gamma': 1.2358745710311166e-05, 'min_child_weight': 9, 'reg_alpha': 0.00023404104943604038, 'reg_lambda': 0.010403693878551347, 'colsample_bytree': 0.02325520491730964, 'subsample': 0.015052665135797933}
Iteration No: 32 ended. Search finished for the next optimal point.
Time taken: 36.2899
Function value obtained: 0.5347
Current minimum: 0.3737
Iteration No: 33 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.28522874884870059, 'n_estimators': 156, 'gamma': 0.026565194479298147, 'min_child_weight': 14, 'reg_alpha': 2.1500198824673943e-05, 'reg_lambda': 0.037625863516613851, 'colsample_bytree': 0.2692027682660163, 'subsample': 0.044447346518025295}
Iteration No: 33 ended. Search finished for the next optimal point.
Time taken: 648.5132
Function value obtained: 0.4340
Current minimum: 0.3737
Iteration No: 34 started. Searching for the next optimal point.
{'max_depth': 3, 'learning_rate': 0.67771541698973647, 'n_estimators': 102, 'gamma': 2.4397865851261756e-05, 'min_child_weight': 22, 'reg_alpha': 0.00097981270095865726, 'reg_lambda': 0.0018876916301041884, 'colsample_bytree': 0.3478388926962846, 'subsample': 0.70758595886358344}
Iteration No: 34 ended. Search finished for the next optimal point.
Time taken: 202.7389
Function value obtained: 0.4102
Current minimum: 0.3737
Iteration No: 35 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.51006561356720237, 'n_estimators': 215, 'gamma': 0.00049769168720053473, 'min_child_weight': 9, 'reg_alpha': 3.6033000122646244e-06, 'reg_lambda': 1.8003557287367282e-06, 'colsample_bytree': 0.71054609250030865, 'subsample': 0.026918950608804761}
Iteration No: 35 ended. Search finished for the next optimal point.
Time taken: 1663.3281
Function value obtained: 1.8009
Current minimum: 0.3737
Iteration No: 36 started. Searching for the next optimal point.
{'max_depth': 15, 'learning_rate': 0.50374671955137251, 'n_estimators': 269, 'gamma': 0.33589907496272015, 'min_child_weight': 21, 'reg_alpha': 0.0054253412803497384, 'reg_lambda': 0.00099137819324499206, 'colsample_bytree': 0.01862415655940856, 'subsample': 0.026667444861411416}
Iteration No: 36 ended. Search finished for the next optimal point.
Time taken: 148.8963
Function value obtained: 0.4592
Current minimum: 0.3737
Iteration No: 37 started. Searching for the next optimal point.
{'max_depth': 11, 'learning_rate': 0.49445551813807792, 'n_estimators': 260, 'gamma': 3.1001225759401958, 'min_child_weight': 19, 'reg_alpha': 0.068984765905792791, 'reg_lambda': 4.8861081345731737e-05, 'colsample_bytree': 0.096309072711133425, 'subsample': 0.46413779260398935}
Iteration No: 37 ended. Search finished for the next optimal point.
Time taken: 484.1447
Function value obtained: 0.4091
Current minimum: 0.3737
Iteration No: 38 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.8212468626131606, 'n_estimators': 135, 'gamma': 0.0020442735813042848, 'min_child_weight': 13, 'reg_alpha': 0.00082583180204953797, 'reg_lambda': 0.00019956289053022855, 'colsample_bytree': 0.2445078970419238, 'subsample': 0.1184367300549489}
Iteration No: 38 ended. Search finished for the next optimal point.
Time taken: 528.9599
Function value obtained: 0.5647
Current minimum: 0.3737
Iteration No: 39 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.32511539945905171, 'n_estimators': 272, 'gamma': 0.10606191166341367, 'min_child_weight': 13, 'reg_alpha': 0.012840198676758985, 'reg_lambda': 0.0013843105821113933, 'colsample_bytree': 0.029623084714611621, 'subsample': 0.095435878169969054}
Iteration No: 39 ended. Search finished for the next optimal point.
Time taken: 150.9976
Function value obtained: 0.4538
Current minimum: 0.3737
Iteration No: 40 started. Searching for the next optimal point.
{'max_depth': 10, 'learning_rate': 0.94071723506532168, 'n_estimators': 209, 'gamma': 0.020207259596293079, 'min_child_weight': 26, 'reg_alpha': 1.7231123705315531e-05, 'reg_lambda': 0.63655182718435632, 'colsample_bytree': 0.12989047076476359, 'subsample': 0.12224658350030361}
Iteration No: 40 ended. Search finished for the next optimal point.
Time taken: 429.1508
Function value obtained: 0.4849
Current minimum: 0.3737
Iteration No: 41 started. Searching for the next optimal point.
{'max_depth': 6, 'learning_rate': 0.78022020758203248, 'n_estimators': 81, 'gamma': 1.5190566011412983, 'min_child_weight': 15, 'reg_alpha': 0.0038749478721906172, 'reg_lambda': 2.0988314994276132e-05, 'colsample_bytree': 0.74351523307754219, 'subsample': 0.022788171184734279}
Iteration No: 41 ended. Search finished for the next optimal point.
Time taken: 336.8551
Function value obtained: 0.4559
Current minimum: 0.3737
Iteration No: 42 started. Searching for the next optimal point.
{'max_depth': 10, 'learning_rate': 0.94218302489949612, 'n_estimators': 254, 'gamma': 0.0025047215303876831, 'min_child_weight': 11, 'reg_alpha': 0.050863000523258303, 'reg_lambda': 0.14600677061450507, 'colsample_bytree': 0.022307415911164044, 'subsample': 0.075086345903369656}
Iteration No: 42 ended. Search finished for the next optimal point.
Time taken: 135.0530
Function value obtained: 0.4774
Current minimum: 0.3737
Iteration No: 43 started. Searching for the next optimal point.
{'max_depth': 11, 'learning_rate': 0.71287190908507236, 'n_estimators': 121, 'gamma': 0.012065575191797134, 'min_child_weight': 11, 'reg_alpha': 4.9905747138293457e-05, 'reg_lambda': 0.00020211668783720192, 'colsample_bytree': 0.040079876629739379, 'subsample': 0.072490189273029443}
Iteration No: 43 ended. Search finished for the next optimal point.
Time taken: 74.5315
Function value obtained: 0.4726
Current minimum: 0.3737
Iteration No: 44 started. Searching for the next optimal point.
{'max_depth': 15, 'learning_rate': 0.13116971909568365, 'n_estimators': 260, 'gamma': 9.9999999999999995e-07, 'min_child_weight': 11, 'reg_alpha': 0.15747391254342932, 'reg_lambda': 0.0048464388657250565, 'colsample_bytree': 1.0, 'subsample': 0.049802315371171829}
Iteration No: 44 ended. Search finished for the next optimal point.
Time taken: 3665.7687
Function value obtained: 0.3918
Current minimum: 0.3737
Iteration No: 45 started. Searching for the next optimal point.
{'max_depth': 11, 'learning_rate': 0.516963517177635, 'n_estimators': 238, 'gamma': 0.0060908490163179476, 'min_child_weight': 11, 'reg_alpha': 0.002840417704368678, 'reg_lambda': 6.715537023900604e-06, 'colsample_bytree': 0.093543240302383271, 'subsample': 0.028274420722338964}
Iteration No: 45 ended. Search finished for the next optimal point.
Time taken: 353.1433
Function value obtained: 0.5063
Current minimum: 0.3737
Iteration No: 46 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.34344475423702203, 'n_estimators': 245, 'gamma': 0.94537140286310084, 'min_child_weight': 14, 'reg_alpha': 0.059046824151080403, 'reg_lambda': 0.00013387029363402017, 'colsample_bytree': 0.064742078008602336, 'subsample': 0.1357356459208498}
Iteration No: 46 ended. Search finished for the next optimal point.
Time taken: 291.4766
Function value obtained: 0.4208
Current minimum: 0.3737
Iteration No: 47 started. Searching for the next optimal point.
{'max_depth': 11, 'learning_rate': 0.56065239902681507, 'n_estimators': 213, 'gamma': 0.0031335932669818854, 'min_child_weight': 12, 'reg_alpha': 0.18320312295750968, 'reg_lambda': 3.3323232142899595e-05, 'colsample_bytree': 0.021132283399102825, 'subsample': 0.13351624647328247}
Iteration No: 47 ended. Search finished for the next optimal point.
Time taken: 120.5349
Function value obtained: 0.4583
Current minimum: 0.3737
Iteration No: 48 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.37005032717685227, 'n_estimators': 198, 'gamma': 0.014028295975397998, 'min_child_weight': 12, 'reg_alpha': 0.00030269636555376502, 'reg_lambda': 0.0065484549381960664, 'colsample_bytree': 0.31060283966150343, 'subsample': 0.02547860032029365}
Iteration No: 48 ended. Search finished for the next optimal point.
Time taken: 863.0543
Function value obtained: 0.5234
Current minimum: 0.3737
Iteration No: 49 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.57406486126919154, 'n_estimators': 186, 'gamma': 0.0094578529846498557, 'min_child_weight': 12, 'reg_alpha': 0.00031748372366040089, 'reg_lambda': 6.2935822390023571e-06, 'colsample_bytree': 0.069504044206397247, 'subsample': 0.1029628739502616}
Iteration No: 49 ended. Search finished for the next optimal point.
Time taken: 230.7823
Function value obtained: 0.4674
Current minimum: 0.3737
Iteration No: 50 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.045335609565088864, 'n_estimators': 266, 'gamma': 0.0077906900179116499, 'min_child_weight': 13, 'reg_alpha': 0.00030555487923571774, 'reg_lambda': 0.00036923181230227333, 'colsample_bytree': 0.16699410389709932, 'subsample': 0.051242665560675943}
Iteration No: 50 ended. Search finished for the next optimal point.
Time taken: 818.7367
Function value obtained: 0.4108
Current minimum: 0.3737
Iteration No: 51 started. Searching for the next optimal point.
{'max_depth': 14, 'learning_rate': 0.98053762594279037, 'n_estimators': 244, 'gamma': 0.011296718744688981, 'min_child_weight': 12, 'reg_alpha': 0.0055523898485123676, 'reg_lambda': 1.1144314564637932e-05, 'colsample_bytree': 0.10518960843274178, 'subsample': 0.060113058192798309}
Iteration No: 51 ended. Search finished for the next optimal point.
Time taken: 511.3860
Function value obtained: 1.1517
Current minimum: 0.3737
Iteration No: 52 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.67160613347360942, 'n_estimators': 170, 'gamma': 0.15816613865187829, 'min_child_weight': 14, 'reg_alpha': 0.10083884451685868, 'reg_lambda': 0.04021745945044139, 'colsample_bytree': 0.056895665102453065, 'subsample': 0.13926336762940264}
Iteration No: 52 ended. Search finished for the next optimal point.
Time taken: 159.3447
Function value obtained: 0.4643
Current minimum: 0.3737
Iteration No: 53 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.43321716211904537, 'n_estimators': 289, 'gamma': 0.025584392438432253, 'min_child_weight': 14, 'reg_alpha': 0.44679518925102107, 'reg_lambda': 0.00019120744800385969, 'colsample_bytree': 0.35701337631596902, 'subsample': 0.11444206697460181}
Iteration No: 53 ended. Search finished for the next optimal point.
Time taken: 1535.7174
Function value obtained: 0.4456
Current minimum: 0.3737
Iteration No: 54 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.2306580079310438, 'n_estimators': 214, 'gamma': 0.000118463237701864, 'min_child_weight': 14, 'reg_alpha': 0.11002962131935076, 'reg_lambda': 0.0021676028850710796, 'colsample_bytree': 0.081452017155650341, 'subsample': 0.17353851895417002}
Iteration No: 54 ended. Search finished for the next optimal point.
Time taken: 282.7638
Function value obtained: 0.4268
Current minimum: 0.3737
Iteration No: 55 started. Searching for the next optimal point.
{'max_depth': 6, 'learning_rate': 0.83963481794925376, 'n_estimators': 280, 'gamma': 1.5275023471128852, 'min_child_weight': 16, 'reg_alpha': 0.00066562990393302375, 'reg_lambda': 0.00034284576463672019, 'colsample_bytree': 0.046865906604191737, 'subsample': 0.019415926260079496}
Iteration No: 55 ended. Search finished for the next optimal point.
Time taken: 163.8107
Function value obtained: 0.4557
Current minimum: 0.3737
Iteration No: 56 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.34650994641210425, 'n_estimators': 190, 'gamma': 6.2373424556684007, 'min_child_weight': 14, 'reg_alpha': 0.19633093892687034, 'reg_lambda': 0.00011803696691211837, 'colsample_bytree': 0.20485664066002426, 'subsample': 0.055174154227073863}
Iteration No: 56 ended. Search finished for the next optimal point.
Time taken: 623.6719
Function value obtained: 0.4410
Current minimum: 0.3737
Iteration No: 57 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.40463454073736554, 'n_estimators': 192, 'gamma': 2.2089946878687896, 'min_child_weight': 9, 'reg_alpha': 0.0011139459596012505, 'reg_lambda': 0.3498885288672019, 'colsample_bytree': 0.025011904676616788, 'subsample': 0.76740140009088464}
Iteration No: 57 ended. Search finished for the next optimal point.
Time taken: 115.4938
Function value obtained: 0.4514
Current minimum: 0.3737
Iteration No: 58 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.43792304195217213, 'n_estimators': 219, 'gamma': 3.1260240825729384, 'min_child_weight': 13, 'reg_alpha': 0.0020940748190037889, 'reg_lambda': 0.03902176601505096, 'colsample_bytree': 0.12181704764639228, 'subsample': 0.12727151200202808}
Iteration No: 58 ended. Search finished for the next optimal point.
Time taken: 408.0404
Function value obtained: 0.4195
Current minimum: 0.3737
Iteration No: 59 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.72068496181928476, 'n_estimators': 289, 'gamma': 5.4738357569968796, 'min_child_weight': 14, 'reg_alpha': 0.0018341268125700511, 'reg_lambda': 7.1496390080007434e-05, 'colsample_bytree': 0.050973749840144487, 'subsample': 0.12393049272156241}
Iteration No: 59 ended. Search finished for the next optimal point.
Time taken: 232.5340
Function value obtained: 0.4472
Current minimum: 0.3737
Iteration No: 60 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.66881856611739665, 'n_estimators': 168, 'gamma': 0.00061835046132261134, 'min_child_weight': 14, 'reg_alpha': 0.007193002570352762, 'reg_lambda': 0.0016488372487645515, 'colsample_bytree': 0.024536011810128208, 'subsample': 0.47290590097590607}
Iteration No: 60 ended. Search finished for the next optimal point.
Time taken: 114.3104
Function value obtained: 0.4559
Current minimum: 0.3737
Iteration No: 61 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.077422713294810847, 'n_estimators': 218, 'gamma': 0.018945532309152562, 'min_child_weight': 13, 'reg_alpha': 0.00013968269684123377, 'reg_lambda': 2.7419397830802385e-05, 'colsample_bytree': 0.17491366005700706, 'subsample': 0.076252349146770634}
Iteration No: 61 ended. Search finished for the next optimal point.
Time taken: 676.9645
Function value obtained: 0.4065
Current minimum: 0.3737
Iteration No: 62 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.5373072401099076, 'n_estimators': 287, 'gamma': 0.00030978199468164648, 'min_child_weight': 14, 'reg_alpha': 0.00014777265686682263, 'reg_lambda': 0.0017113395346122497, 'colsample_bytree': 0.08945400434128048, 'subsample': 0.10588245733408787}
Iteration No: 62 ended. Search finished for the next optimal point.
Time taken: 480.7560
Function value obtained: 0.4642
Current minimum: 0.3737
Iteration No: 63 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.67063404003510674, 'n_estimators': 132, 'gamma': 0.025238768865926832, 'min_child_weight': 14, 'reg_alpha': 0.016500961202100026, 'reg_lambda': 1.3079877077582793e-06, 'colsample_bytree': 0.129933348709378, 'subsample': 0.086184101863115259}
Iteration No: 63 ended. Search finished for the next optimal point.
Time taken: 314.1365
Function value obtained: 0.5006
Current minimum: 0.3737
Iteration No: 64 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.22617632902631526, 'n_estimators': 177, 'gamma': 0.095847479923379594, 'min_child_weight': 14, 'reg_alpha': 0.0038753837400006706, 'reg_lambda': 0.00022921256237776899, 'colsample_bytree': 0.18387203462964183, 'subsample': 0.12141631825780287}
Iteration No: 64 ended. Search finished for the next optimal point.
Time taken: 623.0991
Function value obtained: 0.4069
Current minimum: 0.3737
Iteration No: 65 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.49103704564888234, 'n_estimators': 119, 'gamma': 0.32142568978842451, 'min_child_weight': 14, 'reg_alpha': 0.0012586407665607243, 'reg_lambda': 4.2032739408957124e-05, 'colsample_bytree': 0.036490958996254562, 'subsample': 0.18090744655081845}
Iteration No: 65 ended. Search finished for the next optimal point.
Time taken: 78.2676
Function value obtained: 0.4635
Current minimum: 0.3737
Iteration No: 66 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.051783281241270067, 'n_estimators': 178, 'gamma': 0.10506006822579421, 'min_child_weight': 14, 'reg_alpha': 2.1096019163322345e-06, 'reg_lambda': 0.0034760406893262772, 'colsample_bytree': 0.0785013162855627, 'subsample': 0.05512373106641099}
Iteration No: 66 ended. Search finished for the next optimal point.
Time taken: 222.9443
Function value obtained: 0.4815
Current minimum: 0.3737
Iteration No: 67 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.33140666798419488, 'n_estimators': 175, 'gamma': 0.025496829504396957, 'min_child_weight': 14, 'reg_alpha': 0.00010452280643271414, 'reg_lambda': 0.0044335048702558166, 'colsample_bytree': 0.11163208705936772, 'subsample': 0.10694426607228984}
Iteration No: 67 ended. Search finished for the next optimal point.
Time taken: 359.7064
Function value obtained: 0.4293
Current minimum: 0.3737
Iteration No: 68 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.38022669199652848, 'n_estimators': 247, 'gamma': 0.086161010829523432, 'min_child_weight': 14, 'reg_alpha': 0.0011160712557790437, 'reg_lambda': 0.00012858866815242386, 'colsample_bytree': 0.014741435962996267, 'subsample': 0.18801790685860992}
Iteration No: 68 ended. Search finished for the next optimal point.
Time taken: 147.4984
Function value obtained: 0.4519
Current minimum: 0.3737
Iteration No: 69 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.5322103231433496, 'n_estimators': 200, 'gamma': 0.012902336134580066, 'min_child_weight': 14, 'reg_alpha': 0.00024935286787434991, 'reg_lambda': 3.4843442028093515e-05, 'colsample_bytree': 0.084813140381450505, 'subsample': 0.055171973964725056}
Iteration No: 69 ended. Search finished for the next optimal point.
Time taken: 315.7106
Function value obtained: 0.4809
Current minimum: 0.3737
Iteration No: 70 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.75005254195411675, 'n_estimators': 262, 'gamma': 0.12296588436775634, 'min_child_weight': 14, 'reg_alpha': 2.5655666983163166e-05, 'reg_lambda': 7.2446692191585236e-05, 'colsample_bytree': 0.061483284236486906, 'subsample': 0.12316933888160052}
Iteration No: 70 ended. Search finished for the next optimal point.
Time taken: 330.6033
Function value obtained: 0.4830
Current minimum: 0.3737
Iteration No: 71 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.27173553532835976, 'n_estimators': 261, 'gamma': 0.21932517248722397, 'min_child_weight': 14, 'reg_alpha': 2.7811411277529145e-05, 'reg_lambda': 5.2761108188105193e-06, 'colsample_bytree': 0.018586928900150932, 'subsample': 0.05823161069915421}
Iteration No: 71 ended. Search finished for the next optimal point.
Time taken: 143.5469
Function value obtained: 0.4544
Current minimum: 0.3737
Iteration No: 72 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.45789420995555902, 'n_estimators': 267, 'gamma': 0.022286528570207381, 'min_child_weight': 14, 'reg_alpha': 0.13613340888126507, 'reg_lambda': 0.0003890248122265393, 'colsample_bytree': 0.038518144109649828, 'subsample': 0.068935114252807739}
Iteration No: 72 ended. Search finished for the next optimal point.
Time taken: 148.4610
Function value obtained: 0.4602
Current minimum: 0.3737
Iteration No: 73 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.75249552796727903, 'n_estimators': 255, 'gamma': 0.0020697054874403127, 'min_child_weight': 14, 'reg_alpha': 0.0026023023756958264, 'reg_lambda': 7.4437358935063648e-06, 'colsample_bytree': 0.024255061553844454, 'subsample': 0.047674198195996928}
Iteration No: 73 ended. Search finished for the next optimal point.
Time taken: 142.8545
Function value obtained: 0.4869
Current minimum: 0.3737
Iteration No: 74 started. Searching for the next optimal point.
{'max_depth': 15, 'learning_rate': 0.13092817839679541, 'n_estimators': 27, 'gamma': 9.9999999999999995e-07, 'min_child_weight': 11, 'reg_alpha': 6.0921160376468072e-06, 'reg_lambda': 4.2476497678946127e-05, 'colsample_bytree': 0.34865148272020668, 'subsample': 0.049802333545333803}
Iteration No: 74 ended. Search finished for the next optimal point.
Time taken: 211.2938
Function value obtained: 0.4387
Current minimum: 0.3737
Iteration No: 75 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.59948438210441191, 'n_estimators': 200, 'gamma': 0.0050178918072403822, 'min_child_weight': 14, 'reg_alpha': 0.00019086966717140881, 'reg_lambda': 4.6387721057071522e-05, 'colsample_bytree': 0.22608823511649054, 'subsample': 0.076445405120431895}
Iteration No: 75 ended. Search finished for the next optimal point.
Time taken: 753.7440
Function value obtained: 0.5268
Current minimum: 0.3737
Iteration No: 76 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.42597347417580211, 'n_estimators': 220, 'gamma': 1.2475031021376952, 'min_child_weight': 14, 'reg_alpha': 0.0018723962934562033, 'reg_lambda': 0.00047248498290176536, 'colsample_bytree': 0.38414966817908797, 'subsample': 0.17609530443397364}
Iteration No: 76 ended. Search finished for the next optimal point.
Time taken: 1317.5494
Function value obtained: 0.3995
Current minimum: 0.3737
Iteration No: 77 started. Searching for the next optimal point.
{'max_depth': 3, 'learning_rate': 0.68150132621884207, 'n_estimators': 223, 'gamma': 2.4507053285855902e-05, 'min_child_weight': 22, 'reg_alpha': 0.019742037061595327, 'reg_lambda': 1.6920625762820836e-06, 'colsample_bytree': 0.032492037055529818, 'subsample': 0.70743823204408851}
Iteration No: 77 ended. Search finished for the next optimal point.
Time taken: 113.3583
Function value obtained: 0.4454
Current minimum: 0.3737
Iteration No: 78 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.56561034087143525, 'n_estimators': 149, 'gamma': 1.2181597306990752, 'min_child_weight': 13, 'reg_alpha': 5.2276429838728754e-06, 'reg_lambda': 0.00022065620808587818, 'colsample_bytree': 0.042461299460447882, 'subsample': 0.034894024409337088}
Iteration No: 78 ended. Search finished for the next optimal point.
Time taken: 134.4145
Function value obtained: 0.4471
Current minimum: 0.3737
Iteration No: 79 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.68283213833723799, 'n_estimators': 206, 'gamma': 0.00052849811967770879, 'min_child_weight': 14, 'reg_alpha': 0.37756025788791892, 'reg_lambda': 0.00031766072136745724, 'colsample_bytree': 0.021710420664267503, 'subsample': 0.021037342247576547}
Iteration No: 79 ended. Search finished for the next optimal point.
Time taken: 124.0793
Function value obtained: 0.4877
Current minimum: 0.3737
Iteration No: 80 started. Searching for the next optimal point.
{'max_depth': 12, 'learning_rate': 0.67410656334870023, 'n_estimators': 107, 'gamma': 0.0057967545702116811, 'min_child_weight': 13, 'reg_alpha': 1.4146959718778384e-05, 'reg_lambda': 0.0095648314785466478, 'colsample_bytree': 0.011250911030114909, 'subsample': 0.094592521701009555}
Iteration No: 80 ended. Search finished for the next optimal point.
Time taken: 75.6888
Function value obtained: 0.4720
Current minimum: 0.3737
CPU times: user 9h 43min 53s, sys: 3min 47s, total: 9h 47min 41s
Wall time: 9h 48min 49s
In [ ]:
best = {'max_depth': 11, 'learning_rate': 0.21476942062372031, 'n_estimators': 287, 
         'gamma': 2.3074723022921413e-06, 'min_child_weight': 16, 'reg_alpha': 0.5815787682811846, 
         'reg_lambda': 0.60169547700624582, 'colsample_bytree': 0.78028984074973284, 'subsample': 0.36043975312396048}
In [ ]:
xgb = XGBRegressor(**best)
xgb.fit(X, y)
ypred = xgb.predict(Xtest)
In [ ]:
fig, ax = plt.subplots(1, figsize=(15,15))
plot_importance(xgb, ax=ax)

title

Submission

In [114]:
sub = pd.DataFrame(test.id)
sub['trip_duration'] = 10**ypred

sub.to_csv('my_sub.csv',index=False)

title

Notes

Ao fim das análises e submissão do kaggle, notei shortcomings que deveriam ser corrigidos para melhoria do modelo:

  • A análise deveria conter mais
  • Outliers foram retirados das análises, mas podem possuir counterparts no conjunto de teste.
    • As análises deveriam ser feitas com conjunto de treino E teste, quando possível.
    • Os outliers possivelmente devem ser inclusos no treino.
  • A engenharia de features foi muito crua
    • O horário do dia e dia da semana foram one-hot encoded para facilitar que o modelo encontrasse padrões de trânsito
    • Features mais bem elaboradas poderiam ser colocadas para melhorar o modelo, como:
      • 'é horário de trabalho', entre 7h e 19h
      • 'viagem para aeroporto', através das coordenadas dos aeroportos
      • 'viagem para manhattan', dropoff dentro de uma área definida por quatro retas no plano de latitude e longitude.

Os citado acima seriam os previstos próximos passos na análise desse dataset.

As maiores dificuldades encontradas nesse trabalho foram:

  • Organização da análise
    • Ordem das features a serem analisadas
    • Ordem das análises conjuntas de features (correlação, condicionadas, etc)
  • Forma de lidar com outliers
    • Análise de IQR não era satisfatória
    • Dificultam a análise e visualização das distribuições.
    • A forma encontrada foi principalmente manual, com justificativas heurísticas.